How PGSQL can facilitate the data processing workflow
Parfait Gasana
Data Analyst, Winston & Strawn
## Libraries
library(DBI)
Loading required package: methods
library(RPostgreSQL)
library(microbenchmark)
library(scales)
library(ggplot2)
options(repr.plot.width=15, repr.plot.height=5)
options(scipen=999)
Import CSV Data
Often one of the largest time and resource-intensive steps for data analysts
Requires disk space on shared network or local drives
Multiple files usually unmanaged across versions and teams
csv_import <- function() {
setwd("~/Documents/PGSQL")
bus_month <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Monthly_Day-Type_Averages___Totals.csv')
bus_rides <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Daily_Totals_by_Route.csv')
rail_stops <- read.csv('CTA_-_System_Information_-_List_of__L__Stops.csv')
rail_rides <- read.csv('CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals.csv')
}
Import SQL Data
Centralized data access locally or remotely
Relational model ensures no redundancy
Secured and robust data storage
Available backup with no need of local computers
Table and set relations seamless integrates into datasets
atomic columns and diverse rowset
sql_import <- function() {
conn <- dbConnect(RPostgreSQL::PostgreSQL(), host="10.0.0.220", dbname="cta",
user="ctadba", password="cta18!", port=5432)
bus_month <- dbGetQuery(conn, "SELECT * FROM bus_month")
bus_rides <- dbGetQuery(conn, "SELECT * FROM bus_rides")
rail_stops <- dbGetQuery(conn, "SELECT * FROM rail_stops")
rail_rides <- dbGetQuery(conn, "SELECT * FROM rail_rides")
dbDisconnect(conn)
}
summary(microbenchmark(csv_import))
summary(microbenchmark(sql_import))
View CSV Data
Adjusting different levels and reshaping for analytical purposes
bus_month_csv <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Monthly_Day-Type_Averages___Totals.csv')
head(bus_month_csv)
bus_rides_csv <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Daily_Totals_by_Route.csv')
head(bus_rides_csv)
rail_stops_csv <- read.csv('CTA_-_System_Information_-_List_of__L__Stops.csv')
names(rail_stops_csv) <- tolower(names(rail_stops_csv))
head(rail_stops_csv)
rail_rides_csv <- read.csv('CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals.csv')
head(rail_rides_csv)
bus_month_csv$Month_Beginning <- as.Date(bus_month_csv$Month_Beginning, format="%m/%d/%Y", origin="1970-01-01")
bus_rides_csv$date <- as.Date(bus_rides_csv$date, format="%m/%d/%Y", origin="1970-01-01")
rail_rides_csv$date <- as.Date(rail_rides_csv$date, format="%m/%d/%Y", origin="1970-01-01")
View SQL Data
Relational databases are planned systems
Constraints and referential integrity ensures stability and relationships
Query language provides expression, reliablibility and versatility
conn <- dbConnect(RPostgreSQL::PostgreSQL(), host="10.0.0.220", dbname="cta",
user="ctadba", password="cta18!", port=5432)
bus_month_sql <- dbGetQuery(conn, "SELECT * FROM bus_month")
head(bus_month_sql)
bus_rides_sql <- dbGetQuery(conn, "SELECT * FROM bus_rides")
head(bus_rides_sql)
rail_stations_sql <- dbGetQuery(conn, "SELECT * FROM rail_stations")
head(rail_stations_sql)
rail_rides_sql <- dbGetQuery(conn, "SELECT * FROM rail_rides")
head(rail_rides_sql)
Aggregate CSV Data
Complex processes require long piping/chaining of objects and methods
Language lacks set-based (i.e., relations, join, union) framework
# MERGE
agg_csv <- merge(unique(bus_month_csv[c("route", "routename")]), bus_rides_csv, by="route")
# AGGREGATE
agg_csv <- do.call(data.frame,
aggregate(rides ~ route + routename, agg_csv,
function(x) c(count=length(x), sum=sum(x), mean=mean(x),
median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(-rides.sum),])
agg_csv
Top 5 Bus Routes
# MERGE
agg_csv <- merge(subset(unique(bus_month_csv[c("route", "routename")]),
routename %in% c("79th", "Ashland", "Chicago", "Western", "Cottage Grove")),
transform(bus_rides_csv, year=format(date, "%Y")),
by="route")
# AGGREGATE
agg_csv <- do.call(data.frame,
aggregate(rides ~ routename + year, agg_csv,
function(x) c(count=length(x), sum=sum(x), mean=mean(x),
median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(routename, year),])
agg_csv
Graph CSV Data
seabornPalette <- c("#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
"#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2",
"#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd",
"#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
"#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2",
"#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd",
"#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
"#c44e52","#8172b2","#ccb974","#64b5cd")
ggplot(agg_csv, aes(year, rides.sum, fill=routename)) + geom_col(position = "dodge") +
labs(title="Top 5 CTA 'L' Bus Routes", x="Year", y="Rides") +
scale_y_continuous(expand = c(0, 0), label=comma) +
scale_fill_manual(values = seabornPalette) +
theme(legend.position="bottom",
plot.title = element_text(hjust=0.5, size=18),
axis.text.x = element_text(angle=0, hjust=0.5))

Aggregate SQL Data
Clear, compact declarative language with portability
Processing with virtual tables occurs behind the scene
Set-based framework facilitates blockwise, vectorized process
sql <- 'SELECT rt.route_name, COUNT(rd.rides) AS "count",
SUM(rd.rides) AS "sum",
AVG(rd.rides) AS "mean",
MEDIAN(rd.rides) AS "median",
MIN(rd.rides) AS "min",
MAX(rd.rides) AS "max"
FROM bus_routes rt
INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
GROUP BY rt.route_name
ORDER BY SUM(rd.rides) DESC'
agg_sql <- dbGetQuery(conn, sql)
agg_sql
sql <- 'SELECT rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer AS "year",
COUNT(rd.rides) AS "count",
SUM(rd.rides) AS "sum",
AVG(rd.rides) AS "mean",
MEDIAN(rd.rides) AS "median",
MIN(rd.rides) AS "min",
MAX(rd.rides) AS "max"
FROM bus_routes rt
INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
WHERE rt.route_name IN (\'79th\', \'Ashland\', \'Chicago\', \'Western\', \'Cottage Grove\')
GROUP BY rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer
ORDER BY rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer'
agg_sql <- dbGetQuery(conn, sql)
agg_sql
Graph SQL Data
ggplot(agg_sql, aes(year, sum, color=route_name)) +
geom_line(stat="identity") + geom_point(stat="identity") +
labs(title="Top 5 CTA 'L' Bus Routes", x="Year", y="Rides") +
scale_x_continuous("year", breaks=unique(agg_sql$year)) +
scale_y_continuous(expand = c(0, 0), label=comma) +
scale_color_manual(values = seabornPalette) +
theme(legend.position="bottom",
plot.title = element_text(hjust=0.5, size=18),
axis.text.x = element_text(angle=0, hjust=0.5))

CSV Data Diagnostics
Imperative nature of processing
Dense, nested calls for layered steps
Limited to application layer
# TRANSFORM
agg_csv <- transform(rail_rides_csv, year=format(date, "%Y"))
# AGGREGATE
agg_csv <- do.call(data.frame,
aggregate(rides ~ station_id + stationname + year, agg_csv,
function(x) c(count=length(x), sum=sum(x), mean=mean(x),
median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(-rides.sum),])
agg_csv
# RESHAPE
rail_lines <- c("red", "blue", "green", "brown", "purple", "purple_exp", "yellow", "pink", "orange")
rail_long <- reshape(setNames(rail_stops_csv[c("map_id", "station_name", "location", "red", "blue", "g",
"brn", "p", "pexp", "y", "pnk", "o")],
c("station_id", "station_name", "location", rail_lines)),
varying = rail_lines, v.names = "value",
timevar = "rail_line", times = rail_lines,
new.row.names = 1:10000, direction = "long")
# SUBSET
rail_long <- unique(subset(rail_long, value=='true')[c("station_id", "station_name", "location", "rail_line")])
# ORDER
rail_long <- with(rail_long, rail_long[order(station_id, rail_line),])
row.names(rail_long) <- NULL
rail_long
merge_rail <- merge(agg_csv, rail_long, by="station_id")
merge_rail$rides.total <- merge_rail$rides.sum / with(merge_rail, ave(station_id, station_id, year, FUN=length))
merge_rail
agg_csv <- aggregate(rides.total ~ year + rail_line, merge_rail, sum)
agg_csv <- with(agg_csv, agg_csv[order(rail_line, year),])
row.names(agg_csv) <- NULL
agg_csv
SQL Data Diagnostics
Complex processing still readable and maintainable
CTEs clearly show underlying tables and views without helper objects
Window functions allow useful inline calculations
Conditional aggregations clearly show reshaped data
Seamless casting and conversion of types with :: operator
sql <- 'WITH station_agg AS
(SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
r.station_id,
r.station_name,
COUNT(r.rides)::numeric(20,5) AS "count",
SUM(r.rides)::numeric(20,5) AS "sum",
AVG(r.rides)::numeric(20,5) AS "mean",
MEDIAN(r.rides)::numeric(20,5) AS "median",
MIN(r.rides)::numeric(20,5) AS "min",
MAX(r.rides)::numeric(20,5) AS "max"
FROM rail_rides r
GROUP BY DATE_PART(\'year\', r.ride_date),
r.station_id,
r.station_name
),
merge_rail AS
(SELECT s.*,
r.rail_line,
(s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
FROM station_agg s
INNER JOIN rail_stations r ON s.station_id = r.station_id
)
SELECT m."year", m.rail_line, SUM(m.rail_total) AS rail_total
FROM merge_rail m
GROUP BY m."year", m.rail_line
ORDER BY m.rail_line, m."year"'
agg_sql <- dbGetQuery(conn, sql)
agg_sql
cta_palette <- c(blue="#00A1DE", brown="#62361B", green="#009B3A", orange="#F9461C", pink="#E27EA6",
purple="#522398", purple_exp="#8059BA", red="#C60C30", yellow="#F9E300")
ggplot(subset(agg_sql, year > 2012), aes(year, rail_total, fill=rail_line)) + geom_col(position = "dodge") +
labs(title="CTA System 'L' Lines - Total Rides By Year", x="Year", y="Rides") +
scale_x_continuous(expand = c(0,0), "year", breaks=unique(agg_sql$year)) +
scale_y_continuous(expand = c(0, 0), label=comma) +
scale_fill_manual(values = cta_palette) + guides(fill=guide_legend("Rail Lines", nrow=1)) +
theme(legend.position="bottom",
plot.title = element_text(hjust=0.5, size=18),
axis.text.x = element_text(angle=0, hjust=0.5))

Distribution
sql <- 'SELECT r.station_id, r.ride_date, r.station_name, s.rail_line, r.rides,
(r.rides / COUNT(*) OVER(PARTITION BY s.station_id, r.ride_date)) AS rides_total
FROM rail_rides r
INNER JOIN rail_stations s ON s.station_id = r.station_id'
hist_long <- dbGetQuery(conn, sql)
hist_long
ggplot(hist_long, aes(x=rides_total, fill=rail_line)) +
geom_histogram(data=subset(hist_long, rail_line == 'red'), bins=100) +
geom_histogram(data=subset(hist_long, rail_line == 'blue'), bins=100) +
geom_histogram(data=subset(hist_long, rail_line == 'brown'), bins=100) +
geom_histogram(data=subset(hist_long, rail_line == 'green'), bins=100) +
geom_histogram(data=subset(hist_long, rail_line == 'orange'), bins=100) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
scale_fill_manual(values = c(red = "#C60C30", blue = "#00A1DE", brown = "#62361B",
green = "#009B3A", orange = "#F9461C")) +
labs(title="CTA Ridership Distribution By Rail Line", fill="Rail Line") +
theme(plot.title = element_text(hjust=0.5, size=18))

ggplot(transform(hist_long, year = format(ride_date, '%Y')),
aes(x=year, y=rides_total, fill=year)) +
geom_boxplot() + guides(fill=FALSE) +
scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
labs(title="CTA Ridership Boxplot By Year") +
theme(plot.title = element_text(hjust=0.5, size=18))

ggplot(hist_long, aes(x=rail_line, y=rides_total, fill=rail_line)) +
geom_boxplot() + guides(fill=FALSE) +
scale_fill_manual(values = cta_palette) +
scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
labs(title="CTA Ridership Boxplot By Rail Line") +
theme(plot.title = element_text(hjust=0.5, size=18))

ggplot(subset(within(hist_long, { year <- format(ride_date, '%Y')
year <- as.integer(as.character(year))
}),
year >= 2015 & year <= 2017),
aes(x=factor(year), y=rides_total, fill=rail_line)) +
geom_boxplot() + guides(fill=guide_legend("Rail Lines", nrow=1)) +
scale_fill_manual(values = cta_palette) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
labs(title="CTA Ridership Boxplot By Rail Line, 2015-2017", x="Year", y="Rides") +
theme(legend.position="bottom", plot.title = element_text(hjust=0.5, size=18))

Correlation
sql <- 'WITH station_agg AS
(SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
r.station_id,
r.station_name,
COUNT(r.rides)::numeric(20,5) AS "count",
SUM(r.rides)::numeric(20,5) AS "sum",
AVG(r.rides)::numeric(20,5) AS "mean",
MEDIAN(r.rides)::numeric(20,5) AS "median",
MIN(r.rides)::numeric(20,5) AS "min",
MAX(r.rides)::numeric(20,5) AS "max"
FROM rail_rides r
GROUP BY DATE_PART(\'year\', r.ride_date),
r.station_id,
r.station_name
),
merge_rail AS
(SELECT s.*,
r.rail_line,
(s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
FROM station_agg s
INNER JOIN rail_stations r ON s.station_id = r.station_id
)
SELECT m."year",
SUM(rail_total) FILTER (WHERE rail_line = \'blue\') AS blue,
SUM(rail_total) FILTER (WHERE rail_line = \'brown\') AS brown,
SUM(rail_total) FILTER (WHERE rail_line = \'green\') AS green,
SUM(rail_total) FILTER (WHERE rail_line = \'orange\') AS orange,
SUM(rail_total) FILTER (WHERE rail_line = \'pink\') AS pink,
SUM(rail_total) FILTER (WHERE rail_line = \'purple\') AS purple,
SUM(rail_total) FILTER (WHERE rail_line = \'purple_exp\') AS purple_exp,
SUM(rail_total) FILTER (WHERE rail_line = \'red\') AS red,
SUM(rail_total) FILTER (WHERE rail_line = \'yellow\') AS yellow
FROM merge_rail m
GROUP BY m."year"
ORDER BY m."year"'
wide_sql <- dbGetQuery(conn, sql)
wide_sql
cor(wide_sql[-1], use = "complete.obs", method="pearson")
blue brown green orange pink purple
blue 1.0000000 0.9735257 0.8909543 0.9490871 0.8764534 0.7780920
brown 0.9735257 1.0000000 0.8839102 0.9165204 0.8888845 0.7446475
green 0.8909543 0.8839102 1.0000000 0.9295203 0.8768352 0.7684971
orange 0.9490871 0.9165204 0.9295203 1.0000000 0.8697865 0.8742380
pink 0.8764534 0.8888845 0.8768352 0.8697865 1.0000000 0.5573137
purple 0.7780920 0.7446475 0.7684971 0.8742380 0.5573137 1.0000000
purple_exp 0.9707673 0.9785738 0.9264667 0.9722522 0.8994040 0.8304641
red 0.8931410 0.8892051 0.7947539 0.9334301 0.7611228 0.9142396
yellow 0.7415022 0.7744762 0.7812455 0.7698821 0.6214899 0.7375434
purple_exp red yellow
blue 0.9707673 0.8931410 0.7415022
brown 0.9785738 0.8892051 0.7744762
green 0.9264667 0.7947539 0.7812455
orange 0.9722522 0.9334301 0.7698821
pink 0.8994040 0.7611228 0.6214899
purple 0.8304641 0.9142396 0.7375434
purple_exp 1.0000000 0.9355244 0.7854706
red 0.9355244 1.0000000 0.7241108
yellow 0.7854706 0.7241108 1.0000000
T-tests
combns <- as.list(outer(rail_lines, rail_lines, function(x, y) ifelse(x==y, NA, paste(x, y))))
combns <- lapply(combns[!is.na(combns)], function(x) strsplit(x, split=" ")[[1]])
ttest_matrix <- sapply(combns, function(x){
t <- t.test(wide_sql[[x[1]]], wide_sql[[x[2]]])
c(statistic = t$statistic, p_value = t$p.value)
})
ttest_df <- data.frame(x = sapply(combns, "[", 1),
y = sapply(combns, "[", 2),
statistic = ttest_matrix[1,],
p_value = ttest_matrix[2,])
ttest_df <- with(ttest_df, ttest_df[order(x, y),])
ttest_df
by(ttest_df, ttest_df$x, function(sub)
ggplot(sub, aes(x, statistic, fill=y)) + geom_col(position = "dodge") +
labs(title=paste0("CTA System 'L' Lines - Pairwise T-tests for ",
toupper(substr(sub$x[[1]], 1, 1)),
substr(sub$x[[1]], 2, nchar(as.character(sub$x[[1]]))), " Line"),
x="Rail Line", y="T-test Stat") +
scale_x_discrete(expand = c(0,0)) +
scale_y_continuous(expand = c(0, 0), label=comma) +
scale_fill_manual(values = cta_palette[!names(cta_palette)==sub$x[[1]]]) +
guides(fill=guide_legend("Rail Lines", nrow=1)) +
theme(legend.position="bottom",
plot.title = element_text(hjust=0.5, size=18, colour=cta_palette[names(cta_palette)==sub$x[[1]]]),
axis.text.x = element_text(angle=0, hjust=0.5))
)
ttest_df$x: blue

--------------------------------------------------------
ttest_df$x: brown

--------------------------------------------------------
ttest_df$x: green

--------------------------------------------------------
ttest_df$x: orange

--------------------------------------------------------
ttest_df$x: pink

--------------------------------------------------------
ttest_df$x: purple

--------------------------------------------------------
ttest_df$x: purple_exp

--------------------------------------------------------
ttest_df$x: red

--------------------------------------------------------
ttest_df$x: yellow

CSV Regression
Right-Hand Side Variables
Again, requires maintenance and storage on disk space
Again, requires load time from non-centralized paths
Again, requires any ad-hoc munging for usability
# Source: St. Louis Federal Reserve Bank
unemployment_df <- read.csv('Chicago_Unemployment_Rates.csv')
unemployment_df$Date <- as.Date(unemployment_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(unemployment_df)
# Source: U.S. Department of Energy: EIA
gas_prices_df <- read.csv('US_Gas_Prices.csv')
gas_prices_df$Date <- as.Date(gas_prices_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(gas_prices_df)
# Source: U.S. National Oceanic and Atmospheric Administration (NOAA)
weather_df <- read.csv('Chicago_Weather_Data.csv')
weather_df$Date <- as.Date(weather_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(weather_df)
Bus Model Data
Echoes set-based joins
Repetitive sourcing of same object
Nested dense processing of steps
bus_model_data <- merge(unique(bus_month_csv[c("route", "routename")]), bus_rides_csv, by="route")
bus_model_data <- merge(bus_model_data, unemployment_df, by.x='date', by.y='Date')
bus_model_data <- merge(bus_model_data, gas_prices_df, by.x='date', by.y='Date')
bus_model_data <- merge(bus_model_data, weather_df, by.x='date', by.y='Date')
head(bus_model_data)
Add Seasons
bus_model_data$normalized_dt <- as.POSIXlt(bus_model_data$date)
bus_model_data$normalized_dt$year <- bus_model_data$normalized_dt$year +
(2099 - as.integer(format(bus_model_data$date, "%Y")))
bus_model_data$normalized_dt <- as.Date(bus_model_data$normalized_dt)
bus_model_data$season <- ifelse(bus_model_data$normalized_dt >= '2099-01-01' &
bus_model_data$normalized_dt < '2099-03-19', 'winter',
ifelse(bus_model_data$normalized_dt >= '2099-03-20' &
bus_model_data$normalized_dt < '2099-06-19', 'spring',
ifelse(bus_model_data$normalized_dt >= '2099-06-20' &
bus_model_data$normalized_dt < '2099-09-19', 'summer',
ifelse(bus_model_data$normalized_dt >= '2099-09-20' &
bus_model_data$normalized_dt < '2099-12-19', 'fall',
ifelse(bus_model_data$normalized_dt >= '2099-12-20' &
bus_model_data$normalized_dt < '2099-12-31', 'winter',
NA)
)
)
)
)
bus_model_data[sample(nrow(bus_model_data), 10), c("normalized_dt", "date", "season")]
bus_model_data$normalized_dt <- NULL
Bus Modeling (Ordinary Least Squares)
model <- lm(rides ~ daytype + season + UE_Rate + Gas_Price + AvgTemp + Precipitation + SnowDepth,
data = bus_model_data)
summary(model)
Call:
lm(formula = rides ~ daytype + season + UE_Rate + Gas_Price +
AvgTemp + Precipitation + SnowDepth, data = bus_model_data)
Residuals:
Min 1Q Median 3Q Max
-7661 -4936 -2141 3321 38050
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5209.6046 51.7800 100.610 < 0.0000000000000002 ***
daytypeU -1499.4540 32.9522 -45.504 < 0.0000000000000002 ***
daytypeW 681.4509 25.0499 27.204 < 0.0000000000000002 ***
seasonspring -255.0354 22.8648 -11.154 < 0.0000000000000002 ***
seasonsummer -644.0198 28.4167 -22.663 < 0.0000000000000002 ***
seasonwinter -373.8617 26.7128 -13.996 < 0.0000000000000002 ***
UE_Rate 99.9726 4.1670 23.991 < 0.0000000000000002 ***
Gas_Price -27.0833 10.4389 -2.594 0.00947 **
AvgTemp 11.9167 0.7148 16.671 < 0.0000000000000002 ***
Precipitation -339.3977 23.2796 -14.579 < 0.0000000000000002 ***
SnowDepth -9.7763 5.1624 -1.894 0.05826 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6401 on 665624 degrees of freedom
(148560 observations deleted due to missingness)
Multiple R-squared: 0.01444, Adjusted R-squared: 0.01443
F-statistic: 975.5 on 10 and 665624 DF, p-value: < 0.00000000000000022
Rail Model Data
Assign Latitude and Longitude
rail_long$latitude <- as.numeric(gsub("\\(", "", gsub(",", "", sapply(as.character(rail_long$location),
function(x) strsplit(x, split="\\s+")[[1]][1])))
)
rail_long$longitude <- as.numeric(gsub("\\)", "", sapply(as.character(rail_long$location),
function(x) strsplit(x, split="\\s+")[[1]][2]))
)
rail_long[sample(nrow(rail_long), 10), c("location", "latitude", "longitude")]
rail_model_data <- merge(rail_long, rail_rides_csv, by="station_id")
rail_model_data$raw <- rail_model_data$rides
rail_model_data$rides <- with(rail_model_data, rides /
ave(station_id, station_id, date, FUN=length))
rail_model_data <- merge(rail_model_data, unemployment_df, by.x='date', by.y='Date')
rail_model_data <- merge(rail_model_data, gas_prices_df, by.x='date', by.y='Date')
rail_model_data <- merge(rail_model_data, weather_df, by.x='date', by.y='Date')
head(rail_model_data, 10)
Add Seasons
rail_model_data$normalized_dt <- as.POSIXlt(rail_model_data$date)
rail_model_data$normalized_dt$year <- rail_model_data$normalized_dt$year +
(2099 - as.integer(format(rail_model_data$date, "%Y")))
rail_model_data$normalized_dt <- as.Date(rail_model_data$normalized_dt)
rail_model_data$season <- ifelse(rail_model_data$normalized_dt >= '2099-01-01' &
rail_model_data$normalized_dt < '2099-03-19', 'winter',
ifelse(rail_model_data$normalized_dt >= '2099-03-20' &
rail_model_data$normalized_dt < '2099-06-19', 'spring',
ifelse(rail_model_data$normalized_dt >= '2099-06-20' &
rail_model_data$normalized_dt < '2099-09-19', 'summer',
ifelse(rail_model_data$normalized_dt >= '2099-09-20' &
rail_model_data$normalized_dt < '2099-12-19', 'fall',
ifelse(rail_model_data$normalized_dt >= '2099-12-20' &
rail_model_data$normalized_dt < '2099-12-31', 'winter',
NA)
)
)
)
)
rail_model_data[sample(nrow(rail_model_data), 10), c("normalized_dt", "date", "season")]
rail_model_data$normalized_dt <- NULL
Rail Modeling (Ordinary Least Squares)
model <- lm(rides ~ daytype + season + latitude + longitude + rail_line +
UE_Rate + Gas_Price + AvgTemp + Precipitation + SnowDepth,
data = rail_model_data)
summary(model)
Call:
lm(formula = rides ~ daytype + season + latitude + longitude +
rail_line + UE_Rate + Gas_Price + AvgTemp + Precipitation +
SnowDepth, data = rail_model_data)
Residuals:
Min 1Q Median 3Q Max
-6252.1 -933.1 -256.4 537.0 30569.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -67697.5885 3581.3622 -18.90 < 0.0000000000000002
daytypeU -453.5025 6.6974 -67.71 < 0.0000000000000002
daytypeW 1160.6410 5.3328 217.64 < 0.0000000000000002
seasonspring -178.8823 5.3762 -33.27 < 0.0000000000000002
seasonsummer -189.5721 6.6870 -28.35 < 0.0000000000000002
seasonwinter -157.0931 6.2731 -25.04 < 0.0000000000000002
latitude -3995.6159 36.9560 -108.12 < 0.0000000000000002
longitude -2703.3418 44.1307 -61.26 < 0.0000000000000002
rail_linebrown -1180.7391 7.2879 -162.01 < 0.0000000000000002
rail_linegreen -2070.2382 6.9165 -299.32 < 0.0000000000000002
rail_lineorange -1077.6339 8.3798 -128.60 < 0.0000000000000002
rail_linepink -2211.9982 7.4465 -297.05 < 0.0000000000000002
rail_linepurple -2027.7884 11.2705 -179.92 < 0.0000000000000002
rail_linepurple_exp -1547.7298 7.7477 -199.77 < 0.0000000000000002
rail_linered 1640.2397 6.9847 234.83 < 0.0000000000000002
rail_lineyellow -1400.7779 17.7956 -78.72 < 0.0000000000000002
UE_Rate -5.1186 0.9920 -5.16 0.000000247
Gas_Price 185.7348 2.4565 75.61 < 0.0000000000000002
AvgTemp 5.0953 0.1679 30.34 < 0.0000000000000002
Precipitation -82.5539 5.3670 -15.38 < 0.0000000000000002
SnowDepth 1.4842 1.2067 1.23 0.219
(Intercept) ***
daytypeU ***
daytypeW ***
seasonspring ***
seasonsummer ***
seasonwinter ***
latitude ***
longitude ***
rail_linebrown ***
rail_linegreen ***
rail_lineorange ***
rail_linepink ***
rail_linepurple ***
rail_linepurple_exp ***
rail_linered ***
rail_lineyellow ***
UE_Rate ***
Gas_Price ***
AvgTemp ***
Precipitation ***
SnowDepth
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1850 on 1006966 degrees of freedom
(223613 observations deleted due to missingness)
Multiple R-squared: 0.4033, Adjusted R-squared: 0.4033
F-statistic: 3.403e+04 on 20 and 1006966 DF, p-value: < 0.00000000000000022
SQL Regression
Advanced preparation of data
Materialized view facilitates reproducible research
Compact and straightforward data sourcing
Bus Modeling Data
CREATE MATERIALIZED VIEW Bus_Model_Data AS
SELECT b.id, b.route_id, b.ride_date, b.day_type, b.rides, r.route_name,
CASE
WHEN b.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
WHEN b.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
WHEN b.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
WHEN b.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
WHEN b.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
ELSE NULL
END As season,
ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
FROM
(
SELECT id, route_id, day_type, rides, ride_date,
ride_date + (2099 - date_part('year', ride_date) ||' year')::interval as normalized_date
FROM bus_rides
) b
INNER JOIN bus_routes r ON b.route_id = r.route_id
INNER JOIN unemployment_rates ue ON ue.ue_date = b.ride_date
INNER JOIN gas_prices g ON g.gas_date = b.ride_date
INNER JOIN weather_data w ON w.weather_date = b.ride_date
ORDER BY b.ride_date, NULLIF(regexp_replace(b.route_id, '\D', '', 'g'), '')::int;
REFRESH MATERIALIZED VIEW Bus_Model_Data;
bus_model_data <- dbGetQuery(conn, "SELECT * FROM bus_model_data")
head(bus_model_data)
Bus Modeling (Ordinary Least Squares)
model <- lm(rides ~ day_type + season + ue_rate + gas_price + avg_temp + precipitation + snow_depth,
data = bus_model_data)
summary(model)
Call:
lm(formula = rides ~ day_type + season + ue_rate + gas_price +
avg_temp + precipitation + snow_depth, data = bus_model_data)
Residuals:
Min 1Q Median 3Q Max
-7813 -5072 -2142 3487 37904
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5290.1066 52.9123 99.979 <0.0000000000000002 ***
day_typeU -1512.5779 33.7077 -44.873 <0.0000000000000002 ***
day_typeW 719.1988 25.6034 28.090 <0.0000000000000002 ***
seasonspring -259.2512 23.5676 -11.000 <0.0000000000000002 ***
seasonsummer -644.0360 29.1817 -22.070 <0.0000000000000002 ***
seasonwinter -384.3928 27.1703 -14.148 <0.0000000000000002 ***
ue_rate 99.6607 4.2720 23.329 <0.0000000000000002 ***
gas_price -19.6097 10.7135 -1.830 0.0672 .
avg_temp 12.1726 0.7301 16.673 <0.0000000000000002 ***
precipitation -350.3917 23.6077 -14.842 <0.0000000000000002 ***
snow_depth -9.6659 5.3147 -1.819 0.0690 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6483 on 648166 degrees of freedom
(133403 observations deleted due to missingness)
Multiple R-squared: 0.01478, Adjusted R-squared: 0.01477
F-statistic: 972.7 on 10 and 648166 DF, p-value: < 0.00000000000000022
Rail Modeling Data
CREATE MATERIALIZED VIEW Rail_Model_Data AS
SELECT r.id, r.station_id, r.station_name, r.ride_date, r.day_type, r.rides AS raw,
(r.rides / COUNT(*) OVER(PARTITION BY r.station_id, r.ride_date)) AS rides,
CASE
WHEN r.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
WHEN r.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
WHEN r.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
WHEN r.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
WHEN r.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
ELSE NULL
END As season,
REPLACE(REPLACE((regexp_split_to_array(s.location, '\s+'))[1], ',', ''), '(', '')::numeric AS latitude,
REPLACE((regexp_split_to_array(s.location, '\s+'))[2], ')', '')::numeric AS longitude,
s.rail_line, s.ada, s.direction,
ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
FROM
(
SELECT id, station_id, station_name, day_type, rides, ride_date,
ride_date + (2099 - date_part('year', ride_date) ||' year')::interval as normalized_date
FROM rail_rides
)r
INNER JOIN rail_stations s ON s.station_id = r.station_id
INNER JOIN unemployment_rates ue ON ue.ue_date = r.ride_date
INNER JOIN gas_prices g ON g.gas_date = r.ride_date
INNER JOIN weather_data w ON w.weather_date = r.ride_date
ORDER BY r.ride_date, r.station_id;
REFRESH MATERIALIZED VIEW Rail_Model_Data;
rail_model_data <- dbGetQuery(conn, "SELECT * FROM rail_model_data")
head(rail_model_data)
Rail Modeling (Ordinary Least Squares)
model <- lm(rides ~ day_type + season + latitude + longitude + rail_line +
ue_rate + gas_price + avg_temp + precipitation + snow_depth,
data = rail_model_data)
summary(model)
Call:
lm(formula = rides ~ day_type + season + latitude + longitude +
rail_line + ue_rate + gas_price + avg_temp + precipitation +
snow_depth, data = rail_model_data)
Residuals:
Min 1Q Median 3Q Max
-6249.5 -932.9 -256.2 536.9 30570.4
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -68149.8607 3556.3642 -19.163 < 0.0000000000000002
day_typeU -454.4689 6.6549 -68.290 < 0.0000000000000002
day_typeW 1157.9025 5.2941 218.717 < 0.0000000000000002
seasonspring -177.8150 5.3603 -33.172 < 0.0000000000000002
seasonsummer -185.4726 6.6439 -27.916 < 0.0000000000000002
seasonwinter -160.7995 6.1740 -26.044 < 0.0000000000000002
latitude -3995.1405 36.6987 -108.863 < 0.0000000000000002
longitude -2708.3027 43.8227 -61.801 < 0.0000000000000002
rail_linebrown -1180.4292 7.2371 -163.109 < 0.0000000000000002
rail_linegreen -2069.6169 6.8682 -301.334 < 0.0000000000000002
rail_lineorange -1077.4023 8.3213 -129.476 < 0.0000000000000002
rail_linepink -2211.5330 7.3945 -299.076 < 0.0000000000000002
rail_linepurple -2027.3643 11.1919 -181.146 < 0.0000000000000002
rail_linepurple_exp -1547.6652 7.6936 -201.163 < 0.0000000000000002
rail_linered 1641.3142 6.9360 236.637 < 0.0000000000000002
rail_lineyellow -1400.4416 17.6707 -79.252 < 0.0000000000000002
ue_rate -5.3018 0.9843 -5.387 0.0000000718
gas_price 185.5411 2.4399 76.046 < 0.0000000000000002
avg_temp 5.0833 0.1659 30.635 < 0.0000000000000002
precipitation -83.1416 5.2864 -15.727 < 0.0000000000000002
snow_depth 1.6746 1.2015 1.394 0.163
(Intercept) ***
day_typeU ***
day_typeW ***
seasonspring ***
seasonsummer ***
seasonwinter ***
latitude ***
longitude ***
rail_linebrown ***
rail_linegreen ***
rail_lineorange ***
rail_linepink ***
rail_linepurple ***
rail_linepurple_exp ***
rail_linered ***
rail_lineyellow ***
ue_rate ***
gas_price ***
avg_temp ***
precipitation ***
snow_depth
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1850 on 1021214 degrees of freedom
(209365 observations deleted due to missingness)
Multiple R-squared: 0.4031, Adjusted R-squared: 0.4031
F-statistic: 3.449e+04 on 20 and 1021214 DF, p-value: < 0.00000000000000022
# DISCONNECT FROM DATABASE
dbDisconnect(conn)
[1] TRUE
Conclusions
Postgres provides a stable, centralized, repository for data sourcing
Postgres maintains an expressive SQL dialect for data processing
Postgres supports data science with vectorized methods and reproducibility
---
title: "Leveraging PostgreSQL in Data Science with R"
output: html_notebook
---

<div style="float:right"><img src="postgresql_r.png" height="200" width="200"/></div>

<div style="font-size: 24px;">How PGSQL can facilitate the data processing workflow</div>
## Parfait Gasana ##
<div style="font-size: 20px;">Data Analyst, Winston & Strawn</div>

<style>
div.blue pre { background-color: #EBF4FA; }
</style>

```{r}
## Libraries
library(DBI)
library(RPostgreSQL)
library(microbenchmark)

library(scales)
library(ggplot2)

options(repr.plot.width=15, repr.plot.height=5)
options(scipen=999)

```

## <span style="color: #646464">Import CSV Data</span>

- ### <span style="color: #646464">Often one of the largest time and resource-intensive steps for data analysts</span> ###
- ### <span style="color: #646464">Requires disk space on shared network or local drives</span> ###
- ### <span style="color: #646464">Multiple files usually unmanaged across versions and teams</span> ###

```{r}
csv_import <- function() {
    setwd("~/Documents/PGSQL")
    
    bus_month <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Monthly_Day-Type_Averages___Totals.csv')
    bus_rides <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Daily_Totals_by_Route.csv')

    rail_stops <- read.csv('CTA_-_System_Information_-_List_of__L__Stops.csv')
    rail_rides <- read.csv('CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals.csv')
}
```

## <span style="color:#336791">Import SQL Data</span>

- ### <span style="color:#336791">Centralized data access locally or remotely</span> ###
    - #### <span style="color:#336791">Relational model ensures no redundancy</span> ####
- ### <span style="color:#336791">Secured and robust data storage</span> ###
    - #### <span style="color:#336791">Available backup with no need of local computers</span> ####
- ### <span style="color:#336791">Table and set relations seamless integrates into datasets</span> ###
    - #### <span style="color:#336791">atomic columns and diverse rowset</span> ####

<div class="blue">
```{r}
sql_import <- function() {
    conn <- dbConnect(RPostgreSQL::PostgreSQL(), host="10.0.0.220", dbname="cta",
                      user="ctadba", password="cta18!", port=5432)

    bus_month <- dbGetQuery(conn, "SELECT * FROM bus_month")
    bus_rides <- dbGetQuery(conn, "SELECT * FROM bus_rides")

    rail_stops <- dbGetQuery(conn, "SELECT * FROM rail_stops")
    rail_rides <- dbGetQuery(conn, "SELECT * FROM rail_rides")

    dbDisconnect(conn)
}

summary(microbenchmark(csv_import))
summary(microbenchmark(sql_import))

```
</div>

## <span style="color: #646464">View CSV Data</span>

- ### <span style="color: #646464">Often requires ad-hoc munging: re-formatting or conversion of columns</span> ###
- ### <span style="color: #646464">Cleaning out reports and metadata from actual data on the fly</span> ###
- ### <span style="color: #646464">Adjusting different levels and reshaping for analytical purposes</span> ###

```{r}
bus_month_csv <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Monthly_Day-Type_Averages___Totals.csv')
head(bus_month_csv)

bus_rides_csv <- read.csv('CTA_-_Ridership_-_Bus_Routes_-_Daily_Totals_by_Route.csv')
head(bus_rides_csv)

rail_stops_csv <- read.csv('CTA_-_System_Information_-_List_of__L__Stops.csv')
names(rail_stops_csv) <- tolower(names(rail_stops_csv))
head(rail_stops_csv)

rail_rides_csv <- read.csv('CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals.csv')
head(rail_rides_csv)
```

```{r}
bus_month_csv$Month_Beginning <- as.Date(bus_month_csv$Month_Beginning, format="%m/%d/%Y", origin="1970-01-01")

bus_rides_csv$date <- as.Date(bus_rides_csv$date, format="%m/%d/%Y", origin="1970-01-01")

rail_rides_csv$date <- as.Date(rail_rides_csv$date, format="%m/%d/%Y", origin="1970-01-01")
```

## <span style="color:#336791">View SQL Data</span>

- ### <span style="color:#336791">Relational databases are planned systems</span> ###
- ### <span style="color:#336791">Constraints and referential integrity ensures stability and relationships</span> ###
- ### <span style="color:#336791">Query language provides expression, reliablibility and versatility</span> ###

<div class="blue">
```{r}
conn <- dbConnect(RPostgreSQL::PostgreSQL(), host="10.0.0.220", dbname="cta",
                  user="ctadba", password="cta18!", port=5432)

bus_month_sql <- dbGetQuery(conn, "SELECT * FROM bus_month")
head(bus_month_sql)

bus_rides_sql <- dbGetQuery(conn, "SELECT * FROM bus_rides")
head(bus_rides_sql)

rail_stations_sql <- dbGetQuery(conn, "SELECT * FROM rail_stations")
head(rail_stations_sql)

rail_rides_sql <- dbGetQuery(conn, "SELECT * FROM rail_rides")
head(rail_rides_sql)

```
</div>


## <span style="color: #646464">Aggregate CSV Data</span> 

- ### <span style="color: #646464">Data tool syntax can be formidable for newcomers or returning users</span> ###
- ### <span style="color: #646464">Complex processes require long piping/chaining of objects and methods</span> ###
- ### <span style="color: #646464">Language lacks set-based (i.e., relations, join, union) framework</span> ###
  

```{r}
# MERGE
agg_csv <- merge(unique(bus_month_csv[c("route", "routename")]), bus_rides_csv, by="route")

# AGGREGATE
agg_csv <- do.call(data.frame, 
                   aggregate(rides ~ route + routename, agg_csv, 
                             function(x) c(count=length(x), sum=sum(x), mean=mean(x), 
                                           median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(-rides.sum),])

agg_csv
```


### <span style="color: #646464">Top 5 Bus Routes</span> 

```{r}
# MERGE
agg_csv <- merge(subset(unique(bus_month_csv[c("route", "routename")]), 
                        routename %in% c("79th", "Ashland", "Chicago", "Western", "Cottage Grove")),
                 transform(bus_rides_csv, year=format(date, "%Y")),
                 by="route")

# AGGREGATE
agg_csv <- do.call(data.frame, 
                   aggregate(rides ~ routename + year, agg_csv, 
                             function(x) c(count=length(x), sum=sum(x), mean=mean(x), 
                                           median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(routename, year),])

agg_csv
```


## <span style="color: #646464">Graph CSV Data</span> 

```{r fig1, fig.height = 4, fig.width = 10, fig.align = "center"}

seabornPalette <- c("#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
                    "#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2",
                    "#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd",
                    "#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
                    "#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2",
                    "#ccb974","#64b5cd","#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd",
                    "#4c72b0","#55a868","#c44e52","#8172b2","#ccb974","#64b5cd","#4c72b0","#55a868",
                    "#c44e52","#8172b2","#ccb974","#64b5cd")

ggplot(agg_csv, aes(year, rides.sum, fill=routename)) + geom_col(position = "dodge") +
  labs(title="Top 5 CTA 'L' Bus Routes", x="Year", y="Rides") +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_fill_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```

## <span style="color:#336791">Aggregate SQL Data</span> 

- ### <span style="color:#336791">Clear, compact declarative language with portability</span> ####
- ### <span style="color:#336791">Processing with virtual tables occurs behind the scene</span> ###
- ### <span style="color:#336791">Set-based framework facilitates blockwise, vectorized process</span> ###

<div class="blue">
```{r}
sql <- 'SELECT rt.route_name, COUNT(rd.rides) AS "count", 
                              SUM(rd.rides) AS "sum", 
                              AVG(rd.rides) AS "mean", 
                              MEDIAN(rd.rides) AS "median",
                              MIN(rd.rides) AS "min", 
                              MAX(rd.rides) AS "max"
        FROM bus_routes rt
        INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
        GROUP BY rt.route_name
        ORDER BY SUM(rd.rides) DESC'

agg_sql <- dbGetQuery(conn, sql)

agg_sql
```

```{r}
sql <- 'SELECT rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer AS "year", 
             COUNT(rd.rides) AS "count", 
             SUM(rd.rides) AS "sum", 
             AVG(rd.rides) AS "mean", 
             MEDIAN(rd.rides) AS "median",
             MIN(rd.rides) AS "min", 
             MAX(rd.rides) AS "max"
      FROM bus_routes rt
      INNER JOIN bus_rides rd ON rt.route_id = rd.route_id
      WHERE rt.route_name IN (\'79th\', \'Ashland\', \'Chicago\', \'Western\', \'Cottage Grove\')
      GROUP BY rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer
      ORDER BY rt.route_name, DATE_PART(\'year\', rd.ride_date)::integer'

agg_sql <- dbGetQuery(conn, sql)

agg_sql
```
</div>

## <span style="color:#336791">Graph SQL Data</span>

<div class="blue">
```{r fig2, fig.height = 4, fig.width = 10, fig.align = "center"}
ggplot(agg_sql, aes(year, sum, color=route_name)) + 
  geom_line(stat="identity") + geom_point(stat="identity") +
  labs(title="Top 5 CTA 'L' Bus Routes", x="Year", y="Rides") +
  scale_x_continuous("year", breaks=unique(agg_sql$year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_color_manual(values = seabornPalette) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```
</div>

## <span style="color: #646464">CSV Data Diagnostics</span> ##

- ### <span style="color: #646464">Imperative nature of processing</span> ###
- ### <span style="color: #646464">Dense, nested calls for layered steps</span> ###
- ### <span style="color: #646464">Limited to application layer</span> ###

```{r}
# TRANSFORM
agg_csv <- transform(rail_rides_csv, year=format(date, "%Y"))

# AGGREGATE
agg_csv <- do.call(data.frame, 
                   aggregate(rides ~ station_id + stationname + year, agg_csv, 
                             function(x) c(count=length(x), sum=sum(x), mean=mean(x), 
                                           median=median(x), min=min(x), max=max(x))))
# ORDER
agg_csv <- with(agg_csv, agg_csv[order(-rides.sum),])

agg_csv
```


```{r}
# RESHAPE
rail_lines <- c("red", "blue", "green", "brown", "purple", "purple_exp", "yellow", "pink", "orange")

rail_long <- reshape(setNames(rail_stops_csv[c("map_id", "station_name", "location", "red", "blue", "g", 
                                               "brn", "p", "pexp", "y", "pnk", "o")],
                              c("station_id", "station_name", "location", rail_lines)),
                     varying = rail_lines, v.names = "value", 
                     timevar = "rail_line", times = rail_lines,
                     new.row.names = 1:10000, direction = "long")

# SUBSET
rail_long <- unique(subset(rail_long, value=='true')[c("station_id", "station_name", "location", "rail_line")])

# ORDER
rail_long <- with(rail_long, rail_long[order(station_id, rail_line),])
row.names(rail_long) <- NULL

rail_long
```


```{r}
merge_rail <- merge(agg_csv, rail_long, by="station_id")

merge_rail$rides.total <- merge_rail$rides.sum / with(merge_rail, ave(station_id, station_id, year, FUN=length))

merge_rail
```


```{r}
agg_csv <- aggregate(rides.total ~ year + rail_line, merge_rail, sum)
agg_csv <- with(agg_csv, agg_csv[order(rail_line, year),])
row.names(agg_csv) <- NULL

agg_csv
```

## <span style="color:#336791">SQL Data Diagnostics</span>

- ### <span style="color:#336791">Complex processing still readable and maintainable</span> ###
- ### <span style="color:#336791">CTEs clearly show underlying tables and views without helper objects</span> ###
- ### <span style="color:#336791">Window functions allow useful inline calculations</span> ###
- ### <span style="color:#336791">Conditional aggregations clearly show reshaped data</span> ###
- ### <span style="color:#336791">Seamless casting and conversion of types with `::` operator</span> ###

<div class="blue">
```{r}
sql <- 'WITH station_agg AS
         (SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
                 r.station_id,
                 r.station_name,
                 COUNT(r.rides)::numeric(20,5) AS "count", 
                 SUM(r.rides)::numeric(20,5) AS "sum", 
                 AVG(r.rides)::numeric(20,5) AS "mean", 
                 MEDIAN(r.rides)::numeric(20,5) AS "median",
                 MIN(r.rides)::numeric(20,5) AS "min", 
                 MAX(r.rides)::numeric(20,5) AS "max"
          FROM rail_rides r
          GROUP BY DATE_PART(\'year\', r.ride_date),
                   r.station_id,
                   r.station_name
          ),
                   
      merge_rail AS
         (SELECT s.*, 
                 r.rail_line,
                 (s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
          FROM station_agg s
          INNER JOIN rail_stations r ON s.station_id = r.station_id
         )
         
      SELECT m."year", m.rail_line, SUM(m.rail_total)  AS rail_total
      FROM merge_rail m
      GROUP BY m."year", m.rail_line
      ORDER BY m.rail_line, m."year"'
  
agg_sql <- dbGetQuery(conn, sql)

agg_sql
```


```{r fig3, fig.height = 4, fig.width = 10, fig.align = "center"}
cta_palette <- c(blue="#00A1DE", brown="#62361B", green="#009B3A", orange="#F9461C", pink="#E27EA6",
                 purple="#522398", purple_exp="#8059BA", red="#C60C30", yellow="#F9E300")

ggplot(subset(agg_sql, year > 2012), aes(year, rail_total, fill=rail_line)) + geom_col(position = "dodge") +
  labs(title="CTA System 'L' Lines - Total Rides By Year", x="Year", y="Rides") +
  scale_x_continuous(expand = c(0,0), "year", breaks=unique(agg_sql$year)) +
  scale_y_continuous(expand = c(0, 0), label=comma) +
  scale_fill_manual(values = cta_palette) + guides(fill=guide_legend("Rail Lines", nrow=1)) +
  theme(legend.position="bottom",
        plot.title = element_text(hjust=0.5, size=18),
        axis.text.x = element_text(angle=0, hjust=0.5))
```

</div>

## <span style="color:#336791;">Distribution</span

```{r}
sql <- 'SELECT r.station_id, r.ride_date, r.station_name, s.rail_line, r.rides,
               (r.rides / COUNT(*) OVER(PARTITION BY s.station_id, r.ride_date)) AS rides_total
        FROM rail_rides r
        INNER JOIN rail_stations s ON s.station_id = r.station_id'

hist_long <- dbGetQuery(conn, sql)

hist_long
```

```{r fig4, fig.height = 4, fig.width = 10, fig.align = "center"}
ggplot(hist_long, aes(x=rides_total, fill=rail_line)) +
   geom_histogram(data=subset(hist_long, rail_line == 'red'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'blue'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'brown'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'green'), bins=100) +
   geom_histogram(data=subset(hist_long, rail_line == 'orange'), bins=100) +
   scale_x_continuous(expand = c(0, 0)) +
   scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
   scale_fill_manual(values = c(red = "#C60C30", blue = "#00A1DE", brown = "#62361B",
                                green = "#009B3A", orange = "#F9461C")) +
   labs(title="CTA Ridership Distribution By Rail Line", fill="Rail Line") +
   theme(plot.title = element_text(hjust=0.5, size=18))
```

```{r fig5, fig.height = 4, fig.width = 10, fig.align = "center"}
ggplot(transform(hist_long, year = format(ride_date, '%Y')), 
       aes(x=year, y=rides_total, fill=year)) + 
    geom_boxplot() + guides(fill=FALSE) +
    scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
    labs(title="CTA Ridership Boxplot By Year") +
    theme(plot.title = element_text(hjust=0.5, size=18))
```

```{r fig6, fig.height = 4, fig.width = 10, fig.align = "center"}
ggplot(hist_long, aes(x=rail_line, y=rides_total, fill=rail_line)) + 
    geom_boxplot() + guides(fill=FALSE) +
    scale_fill_manual(values = cta_palette) +
    scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
    labs(title="CTA Ridership Boxplot By Rail Line") +
    theme(plot.title = element_text(hjust=0.5, size=18))
```

```{r fig7, fig.height = 4, fig.width = 10, fig.align = "center"}
ggplot(subset(within(hist_long, { year <- format(ride_date, '%Y')
                                  year <- as.integer(as.character(year))
                                }),
              year >= 2015 & year <= 2017), 
       aes(x=factor(year), y=rides_total, fill=rail_line)) + 
    geom_boxplot() + guides(fill=guide_legend("Rail Lines", nrow=1)) +
    scale_fill_manual(values = cta_palette) +
    scale_x_discrete(expand = c(0, 0)) +
    scale_y_continuous(expand = c(0, 0), lim=c(0,40000), label=comma) +
    labs(title="CTA Ridership Boxplot By Rail Line, 2015-2017", x="Year", y="Rides") +
    theme(legend.position="bottom", plot.title = element_text(hjust=0.5, size=18))
```


## <span style="color:#336791">Correlation</span>

<div class="blue">
```{r}
sql <- 'WITH station_agg AS
         (SELECT DATE_PART(\'year\', r.ride_date)::integer AS "year",
                 r.station_id,
                 r.station_name,
                 COUNT(r.rides)::numeric(20,5) AS "count", 
                 SUM(r.rides)::numeric(20,5) AS "sum", 
                 AVG(r.rides)::numeric(20,5) AS "mean", 
                 MEDIAN(r.rides)::numeric(20,5) AS "median",
                 MIN(r.rides)::numeric(20,5) AS "min", 
                 MAX(r.rides)::numeric(20,5) AS "max"
          FROM rail_rides r
          GROUP BY DATE_PART(\'year\', r.ride_date),
                   r.station_id,
                   r.station_name
          ),
                   
      merge_rail AS
         (SELECT s.*, 
                 r.rail_line,
                 (s."sum" / COUNT(*) OVER(PARTITION BY s.station_id, "year")) AS rail_total
          FROM station_agg s
          INNER JOIN rail_stations r ON s.station_id = r.station_id
         )
         
      SELECT m."year",
             SUM(rail_total) FILTER (WHERE rail_line = \'blue\') AS blue,
             SUM(rail_total) FILTER (WHERE rail_line = \'brown\') AS brown,
             SUM(rail_total) FILTER (WHERE rail_line = \'green\') AS green,
             SUM(rail_total) FILTER (WHERE rail_line = \'orange\') AS orange,
             SUM(rail_total) FILTER (WHERE rail_line = \'pink\') AS pink,
             SUM(rail_total) FILTER (WHERE rail_line = \'purple\') AS purple,
             SUM(rail_total) FILTER (WHERE rail_line = \'purple_exp\') AS purple_exp,
             SUM(rail_total) FILTER (WHERE rail_line = \'red\') AS red,
             SUM(rail_total) FILTER (WHERE rail_line = \'yellow\') AS yellow
      FROM merge_rail m
      GROUP BY m."year"
      ORDER BY m."year"'

wide_sql <- dbGetQuery(conn, sql)

wide_sql
```
</div>

<div class="blue">
```{r}
cor(wide_sql[-1], use = "complete.obs", method="pearson")
```
</div>

## <span style="color:#336791">T-tests</span>

<div class="blue">
```{r}
combns <- as.list(outer(rail_lines, rail_lines, function(x, y) ifelse(x==y, NA, paste(x, y))))
combns <- lapply(combns[!is.na(combns)], function(x) strsplit(x, split=" ")[[1]])

ttest_matrix <- sapply(combns, function(x){
  t <- t.test(wide_sql[[x[1]]], wide_sql[[x[2]]])
  c(statistic = t$statistic, p_value = t$p.value)
  
})

ttest_df <- data.frame(x = sapply(combns, "[", 1),
                       y = sapply(combns, "[", 2),
                       statistic = ttest_matrix[1,],
                       p_value = ttest_matrix[2,])

ttest_df <- with(ttest_df, ttest_df[order(x, y),])
ttest_df
```


```{r fig.height = 4, fig.width = 10, fig.align = "center"}

by(ttest_df, ttest_df$x, function(sub)
  
  ggplot(sub, aes(x, statistic, fill=y)) + geom_col(position = "dodge") +
    labs(title=paste0("CTA System 'L' Lines - Pairwise T-tests for ", 
                     toupper(substr(sub$x[[1]], 1, 1)), 
                     substr(sub$x[[1]], 2, nchar(as.character(sub$x[[1]]))), " Line"), 
         x="Rail Line", y="T-test Stat") +
    scale_x_discrete(expand = c(0,0)) + 
    scale_y_continuous(expand = c(0, 0), label=comma) +
    scale_fill_manual(values = cta_palette[!names(cta_palette)==sub$x[[1]]]) + 
    guides(fill=guide_legend("Rail Lines", nrow=1)) +
    theme(legend.position="bottom",
          plot.title = element_text(hjust=0.5, size=18, colour=cta_palette[names(cta_palette)==sub$x[[1]]]),
          axis.text.x = element_text(angle=0, hjust=0.5))
  )

```
</div>

## <span style="color: #646464">CSV Regression</span> ##

### <span style="color: #646464">Right-Hand Side Variables</span> ##

- ### <span style="color: #646464">Again, requires maintenance and storage on disk space</span> ###
- ### <span style="color: #646464">Again, requires load time from non-centralized paths</span> ###
- ### <span style="color: #646464">Again, requires any ad-hoc munging for usability</span> ###

```{r}

# Source: St. Louis Federal Reserve Bank
unemployment_df <- read.csv('Chicago_Unemployment_Rates.csv')
unemployment_df$Date <- as.Date(unemployment_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(unemployment_df)

# Source: U.S. Department of Energy: EIA
gas_prices_df <- read.csv('US_Gas_Prices.csv')
gas_prices_df$Date <- as.Date(gas_prices_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(gas_prices_df)

# Source: U.S. National Oceanic and Atmospheric Administration (NOAA)
weather_df <- read.csv('Chicago_Weather_Data.csv')
weather_df$Date <- as.Date(weather_df$Date, format='%Y-%m-%d', origin='1970-01-01')
head(weather_df)
```


## <span style="color: #646464">Bus Model Data</span>

- ### <span style="color: #646464">Echoes set-based joins</span> ###
- ### <span style="color: #646464">Repetitive sourcing  of same object</span> ###
- ### <span style="color: #646464">Nested dense processing of steps</span> ###

```{r}
bus_model_data <- merge(unique(bus_month_csv[c("route", "routename")]), bus_rides_csv, by="route")

bus_model_data <- merge(bus_model_data, unemployment_df, by.x='date', by.y='Date')
bus_model_data <- merge(bus_model_data, gas_prices_df, by.x='date', by.y='Date')
bus_model_data <- merge(bus_model_data, weather_df, by.x='date', by.y='Date')

head(bus_model_data)
```


### <span style="color: #646464">Add Seasons</span>

```{r}
bus_model_data$normalized_dt <- as.POSIXlt(bus_model_data$date)
bus_model_data$normalized_dt$year <- bus_model_data$normalized_dt$year +
                                        (2099 - as.integer(format(bus_model_data$date, "%Y")))
bus_model_data$normalized_dt <- as.Date(bus_model_data$normalized_dt)


bus_model_data$season <- ifelse(bus_model_data$normalized_dt >= '2099-01-01' & 
                                   bus_model_data$normalized_dt  < '2099-03-19', 'winter',
                                ifelse(bus_model_data$normalized_dt >= '2099-03-20' & 
                                          bus_model_data$normalized_dt  < '2099-06-19', 'spring',
                                       ifelse(bus_model_data$normalized_dt >= '2099-06-20' & 
                                                 bus_model_data$normalized_dt  < '2099-09-19', 'summer',
                                              ifelse(bus_model_data$normalized_dt >= '2099-09-20' & 
                                                        bus_model_data$normalized_dt  < '2099-12-19', 'fall',
                                                     ifelse(bus_model_data$normalized_dt >= '2099-12-20' & 
                                                              bus_model_data$normalized_dt  < '2099-12-31', 'winter',
                                                            NA)
                                              )
                                       )
                                )
                         )
                                       
bus_model_data[sample(nrow(bus_model_data), 10), c("normalized_dt", "date", "season")]
bus_model_data$normalized_dt <- NULL
```

## <span style="color: #646464">Bus Modeling (Ordinary Least Squares)</span>

```{r}
model <- lm(rides ~ daytype + season + UE_Rate + Gas_Price + AvgTemp + Precipitation + SnowDepth,
            data = bus_model_data)

summary(model)
```

## <span style="color: #646464">Rail Model Data</span>

### <span style="color: #646464">Assign Latitude and Longitude</span>

```{r}

rail_long$latitude <- as.numeric(gsub("\\(", "", gsub(",", "", sapply(as.character(rail_long$location), 
                                                         function(x) strsplit(x, split="\\s+")[[1]][1])))
                                )

rail_long$longitude <- as.numeric(gsub("\\)", "", sapply(as.character(rail_long$location), 
                                            function(x) strsplit(x, split="\\s+")[[1]][2]))
                                  )

rail_long[sample(nrow(rail_long), 10), c("location", "latitude", "longitude")]
```

```{r}
rail_model_data <- merge(rail_long, rail_rides_csv, by="station_id")

rail_model_data$raw <- rail_model_data$rides

rail_model_data$rides <- with(rail_model_data, rides /
                                  ave(station_id, station_id, date, FUN=length))

rail_model_data <- merge(rail_model_data, unemployment_df, by.x='date', by.y='Date')
rail_model_data <- merge(rail_model_data, gas_prices_df, by.x='date', by.y='Date')
rail_model_data <- merge(rail_model_data, weather_df, by.x='date', by.y='Date')

head(rail_model_data, 10)
```


### <span style="color: #646464">Add Seasons</span>

```{r}
rail_model_data$normalized_dt <- as.POSIXlt(rail_model_data$date)
rail_model_data$normalized_dt$year <- rail_model_data$normalized_dt$year +
                                        (2099 - as.integer(format(rail_model_data$date, "%Y")))
rail_model_data$normalized_dt <- as.Date(rail_model_data$normalized_dt)


rail_model_data$season <- ifelse(rail_model_data$normalized_dt >= '2099-01-01' & 
                                   rail_model_data$normalized_dt  < '2099-03-19', 'winter',
                                ifelse(rail_model_data$normalized_dt >= '2099-03-20' & 
                                          rail_model_data$normalized_dt  < '2099-06-19', 'spring',
                                       ifelse(rail_model_data$normalized_dt >= '2099-06-20' & 
                                                 rail_model_data$normalized_dt  < '2099-09-19', 'summer',
                                              ifelse(rail_model_data$normalized_dt >= '2099-09-20' & 
                                                        rail_model_data$normalized_dt  < '2099-12-19', 'fall',
                                                     ifelse(rail_model_data$normalized_dt >= '2099-12-20' & 
                                                              rail_model_data$normalized_dt  < '2099-12-31', 'winter',
                                                            NA)
                                              )
                                       )
                                )
                         )
                                       
rail_model_data[sample(nrow(rail_model_data), 10), c("normalized_dt", "date", "season")]
rail_model_data$normalized_dt <- NULL

```



## <span style="color: #646464">Rail Modeling (Ordinary Least Squares)</span>

```{r}
model <- lm(rides ~ daytype + season + latitude + longitude + rail_line + 
                    UE_Rate + Gas_Price + AvgTemp + Precipitation + SnowDepth, 
            data = rail_model_data)

summary(model)
```

## <span style="color:#336791">SQL Regression</span>

- ### <span style="color:#336791">Advanced preparation of data</span> ###
- ### <span style="color:#336791">Materialized view facilitates reproducible research</span> ###
- ### <span style="color:#336791">Compact and straightforward data sourcing</span> ###

## <span style="color:#336791">Bus Modeling Data</span>

<div class="blue">
```{sql, eval=FALSE}
CREATE MATERIALIZED VIEW Bus_Model_Data AS
    SELECT b.id, b.route_id, b.ride_date, b.day_type, b.rides, r.route_name, 
           CASE 
               WHEN b.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
               WHEN b.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
               WHEN b.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
               WHEN b.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
               WHEN b.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
               ELSE NULL
           END As season,
           ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
    FROM 
     (
      SELECT id, route_id, day_type, rides, ride_date, 
             ride_date + (2099 - date_part('year', ride_date)  ||' year')::interval as normalized_date
      FROM bus_rides
     ) b
    INNER JOIN bus_routes r ON b.route_id = r.route_id
    INNER JOIN unemployment_rates ue ON ue.ue_date = b.ride_date
    INNER JOIN gas_prices g ON g.gas_date = b.ride_date
    INNER JOIN weather_data w ON w.weather_date = b.ride_date
    ORDER BY b.ride_date, NULLIF(regexp_replace(b.route_id, '\D', '', 'g'), '')::int;
    
REFRESH MATERIALIZED VIEW Bus_Model_Data;
```

```{r}
bus_model_data <- dbGetQuery(conn, "SELECT * FROM bus_model_data")

head(bus_model_data)
```
</div>

## <span style="color:#336791">Bus Modeling (Ordinary Least Squares)</span>

<div class="blue">
```{r}
model <- lm(rides ~ day_type + season + ue_rate + gas_price + avg_temp + precipitation + snow_depth,
            data = bus_model_data)

summary(model)
```
</div>

## <span style="color:#336791">Rail Modeling Data</span>

<div class="blue">
```{sql, eval=FALSE}
CREATE MATERIALIZED VIEW Rail_Model_Data AS
    SELECT r.id, r.station_id, r.station_name, r.ride_date, r.day_type, r.rides AS raw, 
          (r.rides / COUNT(*) OVER(PARTITION BY r.station_id, r.ride_date)) AS rides,
          CASE 
               WHEN r.normalized_date BETWEEN '2099-01-01' AND '2099-03-19' THEN 'winter'
               WHEN r.normalized_date BETWEEN '2099-03-20' AND '2099-06-19' THEN 'spring'
               WHEN r.normalized_date BETWEEN '2099-06-20' AND '2099-09-19' THEN 'summer'
               WHEN r.normalized_date BETWEEN '2099-09-20' AND '2099-12-19' THEN 'fall'
               WHEN r.normalized_date BETWEEN '2099-12-20' AND '2099-12-31' THEN 'winter'
               ELSE NULL
           END As season,        
           REPLACE(REPLACE((regexp_split_to_array(s.location, '\s+'))[1], ',', ''), '(', '')::numeric AS latitude,
           REPLACE((regexp_split_to_array(s.location, '\s+'))[2], ')', '')::numeric AS longitude,
           s.rail_line, s.ada, s.direction,
           ue.ue_rate, g.gas_price, w.avg_temp, w.precipitation, w.snow_depth
    FROM 
       (
        SELECT id, station_id, station_name, day_type, rides, ride_date, 
               ride_date + (2099 - date_part('year', ride_date)  ||' year')::interval as normalized_date
        FROM rail_rides
       )r
    INNER JOIN rail_stations s ON s.station_id = r.station_id
    INNER JOIN unemployment_rates ue ON ue.ue_date = r.ride_date
    INNER JOIN gas_prices g ON g.gas_date = r.ride_date
    INNER JOIN weather_data w ON w.weather_date = r.ride_date
    ORDER BY r.ride_date, r.station_id;
    
REFRESH MATERIALIZED VIEW Rail_Model_Data;
```

```{r}
rail_model_data <- dbGetQuery(conn, "SELECT * FROM rail_model_data")

head(rail_model_data)
```
</div>

## <span style="color:#336791">Rail Modeling (Ordinary Least Squares)</span>

<div class="blue">
```{r}
model <- lm(rides ~ day_type + season + latitude + longitude + rail_line + 
                    ue_rate + gas_price + avg_temp + precipitation + snow_depth, 
            data = rail_model_data)

summary(model)
```
</div>

```{r}
# DISCONNECT FROM DATABASE
dbDisconnect(conn)
```

## <span style="color:#336791">Conclusions</span> ##

- ### <span style="color:#336791">Postgres provides a stable, centralized, repository for data sourcing</span> ###
- ### <span style="color:#336791">Postgres maintains an expressive SQL dialect for data processing</span> ###
- ### <span style="color:#336791">Postgres supports data science with vectorized methods and reproducibility</span> ###


